import java.lang.foreign.Arena;
import java.lang.foreign.FunctionDescriptor;
import java.lang.foreign.Linker;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.SymbolLookup;
import java.lang.invoke.MethodHandle;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.VectorShuffle;
import jdk.vm.ci.code.site.DataPatch;
import jdk.vm.ci.code.site.Mark;
import jdk.vm.ci.code.site.Site;
import jdk.vm.ci.hotspot.HotSpotCompiledCode;
import jdk.vm.ci.hotspot.HotSpotCompiledNmethod;
import jdk.vm.ci.hotspot.HotSpotResolvedJavaMethod;
import jdk.vm.ci.meta.Assumptions;
import jdk.vm.ci.meta.ResolvedJavaMethod;
import jdk.vm.ci.runtime.JVMCI;
import jdk.vm.ci.runtime.JVMCICompiler;
import util.UnsafeUtil;
import static java.lang.foreign.ValueLayout.JAVA_FLOAT;
import static java.lang.foreign.ValueLayout.JAVA_INT;
import static java.lang.foreign.ValueLayout.JAVA_LONG;
import static java.nio.ByteOrder.nativeOrder;
import static jdk.incubator.vector.FloatVector.SPECIES_256;

// ref: https://github.com/apangin/nalim
// ref: https://github.com/JOML-CI/joml-bench
// ref: https://godbolt.org/
// ref: https://defuse.ca/online-x86-assembler.htm
// javac:
//   --add-modules jdk.incubator.vector
//   --add-modules jdk.internal.vm.ci
//   --add-exports jdk.internal.vm.ci/jdk.vm.ci.code=ALL-UNNAMED
//   --add-exports jdk.internal.vm.ci/jdk.vm.ci.code.site=ALL-UNNAMED
//   --add-exports jdk.internal.vm.ci/jdk.vm.ci.hotspot=ALL-UNNAMED
//   --add-exports jdk.internal.vm.ci/jdk.vm.ci.runtime=ALL-UNNAMED
//   --add-exports jdk.internal.vm.ci/jdk.vm.ci.meta=ALL-UNNAMED
// java:
//   --add-modules jdk.incubator.vector -XX:+UnlockExperimentalVMOptions -XX:+EnableJVMCI --enable-native-access=ALL-UNNAMED
public class TestNativeCode {
	// Class initializer: sets the Vector API bounds-check property (presumably "0" turns the
	// checks off - confirm against the JDK) and opens the JVMCI packages to all unnamed modules
	// reflectively, mirroring the --add-opens flags named below.
	static {
		System.setProperty("jdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK", "0");
		try {
			var addOpens = UnsafeUtil.getMethod(Module.class, "implAddOpensToAllUnnamed", String.class);
			var jvmciModule = Class.forName("jdk.vm.ci.runtime.JVMCI").getModule();
			// Each entry stands for: --add-opens jdk.internal.vm.ci/<package>=ALL-UNNAMED
			final String[] openedPackages = {
					"jdk.vm.ci.code",
					"jdk.vm.ci.code.site",
					"jdk.vm.ci.hotspot",
					"jdk.vm.ci.runtime",
					"jdk.vm.ci.meta",
			};
			for (String pkg : openedPackages)
				addOpens.invoke(jvmciModule, pkg);
		} catch (ReflectiveOperationException e) {
			throw new ExceptionInInitializerError(e);
		}
	}

	/**
	 * Installs {@code code} as the default compiled code of a {@code static native} method through JVMCI.
	 * <p>
	 * The final 5 bytes of {@code code} are registered as the entry-barrier patch site (mark id 7),
	 * so every stub passed here has to end with the 5-byte barrier placeholder.
	 *
	 * @param method the {@code static native} method whose invocations should execute {@code code}
	 * @param code   raw machine code for the running platform, ending with the barrier placeholder
	 * @throws IllegalArgumentException if {@code method} is not both static and native, or if
	 *                                  {@code code} is too short to contain the barrier placeholder
	 */
	public static void linkCode(Method method, byte[] code) {
		// Validate unconditionally: an assert is skipped unless the JVM runs with -ea, and installing
		// machine code for an unsuitable method must never happen silently.
		final int staticNative = Modifier.STATIC | Modifier.NATIVE;
		if ((method.getModifiers() & staticNative) != staticNative)
			throw new IllegalArgumentException("not a static native method: " + method);
		// The barrier mark below sits at code.length - 5; a shorter array would yield a negative offset.
		if (code.length < 5)
			throw new IllegalArgumentException("code must end with the 5-byte entry barrier, length=" + code.length);
		var jvmci = JVMCI.getRuntime().getHostJVMCIBackend();
		var rm = jvmci.getMetaAccess().lookupJavaMethod(method);
		jvmci.getCodeCache().setDefaultCode(rm, new HotSpotCompiledNmethod(method.getName(), code, code.length,
				new Site[]{new Mark(code.length - 5, 7 /*ENTRY_BARRIER_PATCH*/)}, new Assumptions.Assumption[0],
				new ResolvedJavaMethod[0], new HotSpotCompiledCode.Comment[0], new byte[0], 1, new DataPatch[0],
				true, 0, null, (HotSpotResolvedJavaMethod)rm, JVMCICompiler.INVOCATION_ENTRY_BCI, 1, 0, false));
	}

	/** Argument-less native stub used to time the bare call overhead; it jumps to {@code addNaked} in add.dll. */
	public static native void dummy();

	// Links dummy(): an absolute-jump trampoline followed by the 5-byte barrier placeholder.
	static {
		try {
			long target = SymbolLookup.libraryLookup("add.dll", Arena.global())
					.find("addNaked").orElseThrow().address();
			final byte[] stub = {
					0x48, (byte)0xb8, 0, 0, 0, 0, 0, 0, 0, 0, // mov rax, func_addr (imm64 patched below)
					(byte)0xff, (byte)0xe0,                   // jmp rax
					// no padding needed: the barrier already starts on a 4-byte boundary (offset 12)
					0x41, (byte)0x81, 0x7f, 0, 0              // barrier
			};
			// The 64-bit immediate of the mov starts at byte offset 2.
			ByteBuffer.wrap(stub).order(nativeOrder()).putLong(2, target);
			linkCode(TestNativeCode.class.getMethod("dummy"), stub);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Times one million {@link #dummy()} calls and reports the elapsed nanoseconds on stderr. */
	public static void testDummyNaked() {
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			dummy();
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testDummyNaked", 0, elapsed);
	}

	/** Native {@code add} from add.dll, reached through a stub that first moves the Java argument registers into the native ones. */
	public static native int add(int a, int b);

	// Links add(): shuffles edx/r8d into ecx/edx, then jumps to the exported "add" symbol.
	static {
		try {
			long target = SymbolLookup.libraryLookup("add.dll", Arena.global())
					.find("add").orElseThrow().address();
			final byte[] stub = {
					(byte)0x89, (byte)0xd1,                   // mov  ecx, edx
					0x44, (byte)0x89, (byte)0xc2,             // mov  edx, r8d
					0x48, (byte)0xb8, 0, 0, 0, 0, 0, 0, 0, 0, // mov rax, func_addr (imm64 patched below)
					(byte)0xff, (byte)0xe0,                   // jmp rax
					0, 0, 0,                                  // 4-byte align padding
					0x41, (byte)0x81, 0x7f, 0, 0              // barrier
			};
			// The 64-bit immediate of the mov starts at byte offset 7 (after the two register moves).
			ByteBuffer.wrap(stub).order(nativeOrder()).putLong(7, target);
			linkCode(TestNativeCode.class.getMethod("add", int.class, int.class), stub);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Times one million chained {@link #add(int, int)} calls; prints the final sum and elapsed nanoseconds to stderr. */
	public static void testAdd() {
		int acc = 0;
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			acc = add(acc, n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAdd", acc, elapsed);
	}

	/** Native {@code addNaked} from add.dll, reached by a plain jump with the argument registers left untouched. */
	public static native int addNaked(int a, int b);

	// Links addNaked(): an absolute-jump trampoline followed by the 5-byte barrier placeholder.
	static {
		try {
			long target = SymbolLookup.libraryLookup("add.dll", Arena.global())
					.find("addNaked").orElseThrow().address();
			final byte[] stub = {
					0x48, (byte)0xb8, 0, 0, 0, 0, 0, 0, 0, 0, // mov rax, func_addr (imm64 patched below)
					(byte)0xff, (byte)0xe0,                   // jmp rax
					// no padding needed: the barrier already starts on a 4-byte boundary (offset 12)
					0x41, (byte)0x81, 0x7f, 0, 0              // barrier
			};
			// The 64-bit immediate of the mov starts at byte offset 2.
			ByteBuffer.wrap(stub).order(nativeOrder()).putLong(2, target);
			linkCode(TestNativeCode.class.getMethod("addNaked", int.class, int.class), stub);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Times one million chained {@link #addNaked(int, int)} calls; prints the final sum and elapsed nanoseconds to stderr. */
	public static void testAddNaked() {
		int acc = 0;
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			acc = addNaked(acc, n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAddNaked", acc, elapsed);
	}

	/** Panama downcall handle for the {@code add} symbol of add.dll, linked with default options. */
	private static final MethodHandle mhAdd;

	static {
		var addSymbol = SymbolLookup.libraryLookup("add.dll", Arena.global()).find("add").orElseThrow();
		// int add(int, int)
		var signature = FunctionDescriptor.of(JAVA_INT, JAVA_INT, JAVA_INT);
		mhAdd = Linker.nativeLinker().downcallHandle(addSymbol, signature);
	}

	/**
	 * Times one million chained calls through the {@code mhAdd} downcall handle; prints the final
	 * sum and elapsed nanoseconds to stderr. Anything thrown by the handle is rethrown wrapped in
	 * a {@link RuntimeException}.
	 */
	public static void testAddPanama() {
		try {
			int acc = 0;
			final long start = System.nanoTime();
			for (int n = 0; n < 1_000_000; n++) {
				// invokeExact needs the exact (int, int)int shape, hence the explicit cast.
				acc = (int)mhAdd.invokeExact(acc, n);
			}
			long elapsed = System.nanoTime() - start;
			System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAddPanama", acc, elapsed);
		} catch (Throwable e) {
			throw new RuntimeException(e);
		}
	}

	/** Panama downcall handle for the {@code add} symbol of add.dll, linked with {@code Linker.Option.critical(false)}. */
	private static final MethodHandle mhAddCritical;

	static {
		var addSymbol = SymbolLookup.libraryLookup("add.dll", Arena.global()).find("add").orElseThrow();
		// int add(int, int)
		var signature = FunctionDescriptor.of(JAVA_INT, JAVA_INT, JAVA_INT);
		var criticalOption = Linker.Option.critical(false);
		mhAddCritical = Linker.nativeLinker().downcallHandle(addSymbol, signature, criticalOption);
	}

	/**
	 * Times one million chained calls through the {@code mhAddCritical} downcall handle; prints the
	 * final sum and elapsed nanoseconds to stderr. Anything thrown by the handle is rethrown wrapped
	 * in a {@link RuntimeException}.
	 */
	public static void testAddPanamaCritical() {
		try {
			int acc = 0;
			final long start = System.nanoTime();
			for (int n = 0; n < 1_000_000; n++) {
				// invokeExact needs the exact (int, int)int shape, hence the explicit cast.
				acc = (int)mhAddCritical.invokeExact(acc, n);
			}
			long elapsed = System.nanoTime() - start;
			System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAddPanamaCritical", acc, elapsed);
		} catch (Throwable e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Times one million chained {@code TestJNI.nativeAdd} calls; prints the final sum and elapsed
	 * nanoseconds to stderr. Anything thrown is rethrown wrapped in a {@link RuntimeException}.
	 */
	public static void testAddJni() {
		try {
			int acc = 0;
			final long start = System.nanoTime();
			for (int n = 0; n < 1_000_000; n++) {
				acc = TestJNI.nativeAdd(acc, n);
			}
			long elapsed = System.nanoTime() - start;
			System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAddJni", acc, elapsed);
		} catch (Throwable e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Pure-Java baseline for the native {@code add} variants.
	 *
	 * @param a first addend
	 * @param b second addend
	 * @return {@code a + b} with ordinary wrapping int arithmetic
	 */
	public static int addJava(int a, int b) {
		final int sum = a + b;
		return sum;
	}

	/**
	 * Times one million chained {@link #addJava(int, int)} calls; prints the final sum and elapsed
	 * nanoseconds to stderr.
	 * <p>
	 * Run with -XX:-Inline to measure a real call instead of an inlined addition.
	 */
	public static void testAddJava() {
		// No try/catch here: addJava throws nothing checked, and a blanket catch (Throwable)
		// would only re-wrap Errors in a RuntimeException.
		int r = 0;
		var t = System.nanoTime();
		for (int i = 0; i < 1_000_000; i++)
			r = addJava(r, i);
		System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testAddJava", r, System.nanoTime() - t);
	}

	/** Executes the x64 {@code rdtsc} instruction and returns the counter as {@code (rdx << 32) | rax}. */
	public static native long rdtsc();

	// Links rdtsc(): the whole implementation is the inline machine code below.
	static {
		final byte[] stub = {
				0x0f, 0x31,                         // rdtsc
				0x48, (byte)0xc1, (byte)0xe2, 0x20, // shl rdx,32
				0x48, 0x09, (byte)0xd0,             // or rax,rdx
				(byte)0xc3,                         // ret
				0, 0,                               // 4-byte align padding
				0x41, (byte)0x81, 0x7f, 0, 0        // barrier
		};
		try {
			linkCode(TestNativeCode.class.getMethod("rdtsc"), stub);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Times one million {@link #rdtsc()} calls; prints the summed counter values and elapsed nanoseconds to stderr. */
	public static void testRdtsc() {
		final long start = System.nanoTime();
		long total = 0;
		for (int n = 0; n < 1_000_000; n++) {
			total += rdtsc();
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13d, %8d ns/1M-invokes\n", "testRdtsc", total, elapsed);
	}

	/** Applies the x64 {@code rsqrtss} instruction to {@code a} (the low-precision variant in this benchmark). */
	public static native float rsqrtss(float a);

	// Links rsqrtss(): the whole implementation is the inline machine code below.
	static {
		final byte[] stub = {
				(byte)0xf3, 0x0f, 0x52, (byte)0xc0, // rsqrtss xmm0,xmm0
				(byte)0xc3,                         // ret
				0, 0, 0,                            // 4-byte align padding
				0x41, (byte)0x81, 0x7f, 0, 0        // barrier
		};
		try {
			linkCode(TestNativeCode.class.getMethod("rsqrtss", float.class), stub);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Sums {@link #rsqrtss(float)} over 1..1,000,000; prints the sum and elapsed nanoseconds to stderr. */
	public static void testRsqrtss() {
		final long start = System.nanoTime();
		float total = 0;
		for (int n = 1; n <= 1_000_000; n++) {
			total += rsqrtss(n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testRsqrtss", total, elapsed);
	}

	/**
	 * Reciprocal square root built from {@code rsqrtss} plus one refinement step, implemented
	 * entirely by the machine code installed below. With r = rsqrtss(a) it evaluates
	 * {@code (a * r * r - 3) * (r * -0.5f)}, the same expression as {@link #rsqrtss1Java(float)}.
	 */
	public static native float rsqrtss1(float a);

	// Links rsqrtss1(). The two float constants are stored after the ret and addressed
	// rip-relative, so the displacements in the mulss/addss encodings depend on this exact layout;
	// the hex offsets in the comments are byte positions within the array.
	static {
		try {
			linkCode(TestNativeCode.class.getMethod("rsqrtss1", float.class), new byte[]{
					(byte)0xf3, 0x0f, 0x52, (byte)0xc8,          // 0x00: rsqrtss xmm1, xmm0
					(byte)0xf3, 0x0f, 0x59, (byte)0xc1,          // 0x04: mulss   xmm0, xmm1
					(byte)0xf3, 0x0f, 0x59, (byte)0xc1,          // 0x08: mulss   xmm0, xmm1
					(byte)0xf3, 0x0f, 0x59, 0x0d, 0x10, 0, 0, 0, // 0x0c: mulss   xmm1, dword ptr[rip+0x10] (0x24)
					(byte)0xf3, 0x0f, 0x58, 0x05, 0x0c, 0, 0, 0, // 0x14: addss   xmm0, dword ptr[rip+0x0c] (0x28)
					(byte)0xf3, 0x0f, 0x59, (byte)0xc1,          // 0x1c: mulss   xmm0, xmm1
					(byte)0xc3,                                  // 0x20: ret
					0, 0, 0,                                     // 0x21: 4-byte align padding
					0, 0, 0, (byte)0xbf,                         // 0x24: -0.5f
					0, 0, 0x40, (byte)0xc0,                      // 0x28: -3.0f
					// 4-byte align padding
					0x41, (byte)0x81, 0x7f, 0, 0 // barrier
			});
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	/** Sums {@link #rsqrtss1(float)} over 1..1,000,000; prints the sum and elapsed nanoseconds to stderr. */
	public static void testRsqrtss1() {
		final long start = System.nanoTime();
		float total = 0;
		for (int n = 1; n <= 1_000_000; n++) {
			total += rsqrtss1(n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testRsqrtss1", total, elapsed);
	}

	/**
	 * Java counterpart of {@code rsqrtss1}: takes the {@link #rsqrtss(float)} estimate and applies
	 * the refinement expression {@code (a * r * r - 3) * (r * -0.5f)} in Java.
	 *
	 * @param a input value
	 * @return the refined reciprocal-square-root estimate
	 */
	public static float rsqrtss1Java(float a) {
		final float estimate = rsqrtss(a);
		final float residual = a * estimate * estimate - 3;
		final float scale = estimate * -0.5f;
		return residual * scale;
	}

	/** Sums {@link #rsqrtss1Java(float)} over 1..1,000,000; prints the sum and elapsed nanoseconds to stderr. */
	public static void testRsqrtss1Java() {
		final long start = System.nanoTime();
		float total = 0;
		for (int n = 1; n <= 1_000_000; n++) {
			total += rsqrtss1Java(n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testRsqrtss1Java", total, elapsed);
	}

	/**
	 * Pure-Java reciprocal square root.
	 *
	 * @param a input value
	 * @return {@code 1 / (float)Math.sqrt(a)}
	 */
	public static float rsqrtJava(float a) {
		final float root = (float)Math.sqrt(a);
		return 1 / root;
	}

	/** Sums {@link #rsqrtJava(float)} over 1..1,000,000; prints the sum and elapsed nanoseconds to stderr. */
	public static void testRsqrtJava() {
		final long start = System.nanoTime();
		float total = 0;
		for (int n = 1; n <= 1_000_000; n++) {
			total += rsqrtJava(n);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testRsqrtJava", total, elapsed);
	}

	// 4x4 float matrix product m2 = m0 * m1 on raw addresses, implemented by the machine code below.
	// Element indices of the row-major layout:
	// |  0  1  2  3 |
	// |  4  5  6  7 |
	// |  8  9 10 11 | m0 * m1 => m2
	// | 12 13 14 15 | m2[0] = m0[0,1,2,3] *+ m1[0,4,8,12]; ...
	// m0, m1, m2 are native addresses of 16 floats each. The stub does not compute a float
	// result and the caller in this file discards the return value.
	public static native float matMul256(long m0, long m1, long m2); // need AVX

	// Links matMul256(long, long, long): 118 bytes of AVX/FMA code, 2 padding bytes, then the barrier.
	static {
		try {
			linkCode(TestNativeCode.class.getMethod("matMul256", long.class, long.class, long.class), new byte[]{
					(byte)0xC5, (byte)0xFC, (byte)0x10, (byte)0x02, (byte)0xC5, (byte)0xFC, (byte)0x10, (byte)0x4A,
					(byte)0x20, (byte)0xC4, (byte)0xC2, (byte)0x7D, (byte)0x1A, (byte)0x20, (byte)0xC4, (byte)0xC3,
					(byte)0x4D, (byte)0x06, (byte)0x30, (byte)0x33, (byte)0xC5, (byte)0xFC, (byte)0xC6, (byte)0xD0,
					(byte)0x00, (byte)0xC5, (byte)0xF4, (byte)0xC6, (byte)0xD9, (byte)0x00, (byte)0xC5, (byte)0xEC,
					(byte)0x59, (byte)0xD4, (byte)0xC5, (byte)0xE4, (byte)0x59, (byte)0xDC, (byte)0xC5, (byte)0xFC,
					(byte)0xC6, (byte)0xE0, (byte)0x55, (byte)0xC5, (byte)0xF4, (byte)0xC6, (byte)0xE9, (byte)0x55,
					(byte)0xC4, (byte)0xE2, (byte)0x5D, (byte)0xB8, (byte)0xD6, (byte)0xC4, (byte)0xE2, (byte)0x55,
					(byte)0xB8, (byte)0xDE, (byte)0xC5, (byte)0xFC, (byte)0xC6, (byte)0xE0, (byte)0xAA, (byte)0xC5,
					(byte)0xF4, (byte)0xC6, (byte)0xE9, (byte)0xAA, (byte)0xC4, (byte)0xC2, (byte)0x7D, (byte)0x1A,
					(byte)0x70, (byte)0x20, (byte)0xC4, (byte)0xE2, (byte)0x6D, (byte)0x98, (byte)0xE6, (byte)0xC4,
					(byte)0xE2, (byte)0x65, (byte)0x98, (byte)0xEE, (byte)0xC5, (byte)0xFC, (byte)0xC6, (byte)0xC0,
					(byte)0xFF, (byte)0xC5, (byte)0xF4, (byte)0xC6, (byte)0xC9, (byte)0xFF, (byte)0xC4, (byte)0xC3,
					(byte)0x4D, (byte)0x06, (byte)0x70, (byte)0x20, (byte)0x33, (byte)0xC4, (byte)0xE2, (byte)0x5D,
					(byte)0x98, (byte)0xC6, (byte)0xC4, (byte)0xE2, (byte)0x55, (byte)0x98, (byte)0xCE, (byte)0xC4,
					(byte)0xC1, (byte)0x7C, (byte)0x11, (byte)0x01, (byte)0xC4, (byte)0xC1, (byte)0x7C, (byte)0x11,
					(byte)0x49, (byte)0x20, (byte)0xC5, (byte)0xF8, (byte)0x77, (byte)0xC3,
					0, 0, // 4-byte align padding
					0x41, (byte)0x81, 0x7f, 0, 0 // barrier
			});
// Assembly listing for the bytes above. Register roles per the listing: rdx = m0, r8 = m1, r9 = m2.
// NOTE(review): the listing is not a byte-for-byte disassembly. The encoded bytes appear to
// issue the [r8]-based vbroadcastf128/vperm2f128 before the first vshufps, and to encode ymm6
// (not ymm0) as the first vperm2f128 source - which should not matter with imm8 0x33.
// Confirm with a disassembler before editing either side.
/* only for windows and x64 with AVX & FMA3
vmovups			ymm0, [rdx]
vmovups			ymm1, [rdx+0x20]
vshufps			ymm2, ymm0, ymm0, 0
vshufps			ymm3, ymm1, ymm1, 0
vbroadcastf128	ymm4, [r8]
vmulps			ymm2, ymm2, ymm4
vmulps			ymm3, ymm3, ymm4
vshufps			ymm4, ymm0, ymm0, 0x55
vshufps			ymm5, ymm1, ymm1, 0x55
vperm2f128		ymm6, ymm0, [r8], 0x33
vfmadd231ps		ymm2, ymm4, ymm6
vfmadd231ps		ymm3, ymm5, ymm6
vshufps			ymm4, ymm0, ymm0, 0xaa
vshufps			ymm5, ymm1, ymm1, 0xaa
vbroadcastf128	ymm6, [r8+0x20]
vfmadd132ps		ymm4, ymm2, ymm6
vfmadd132ps		ymm5, ymm3, ymm6
vshufps			ymm0, ymm0, ymm0, 0xff
vshufps			ymm1, ymm1, ymm1, 0xff
vperm2f128		ymm6, ymm0, [r8+0x20], 0x33
vfmadd132ps		ymm0, ymm4, ymm6
vfmadd132ps		ymm1, ymm5, ymm6
vmovups			[r9], ymm0
vmovups			[r9+0x20], ymm1
vzeroupper
ret
*/
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}

	// Lane shuffles for 8-float vectors. The first two duplicate one 128-bit half into both halves;
	// the other four broadcast one element of each half across that half.
	private static final VectorShuffle<Float> s01230123 = SPECIES_256.shuffleFromValues(0, 1, 2, 3, 0, 1, 2, 3); // _mm256_permute2f128_ps(..., 0x00);
	private static final VectorShuffle<Float> s45674567 = SPECIES_256.shuffleFromValues(4, 5, 6, 7, 4, 5, 6, 7); // _mm256_permute2f128_ps(..., 0x11);
	private static final VectorShuffle<Float> s00004444 = SPECIES_256.shuffleFromValues(0, 0, 0, 0, 4, 4, 4, 4); // _MM_SHUFFLE(0, 0, 0, 0)
	private static final VectorShuffle<Float> s11115555 = SPECIES_256.shuffleFromValues(1, 1, 1, 1, 5, 5, 5, 5); // _MM_SHUFFLE(1, 1, 1, 1)
	private static final VectorShuffle<Float> s22226666 = SPECIES_256.shuffleFromValues(2, 2, 2, 2, 6, 6, 6, 6); // _MM_SHUFFLE(2, 2, 2, 2)
	private static final VectorShuffle<Float> s33337777 = SPECIES_256.shuffleFromValues(3, 3, 3, 3, 7, 7, 7, 7); // _MM_SHUFFLE(3, 3, 3, 3)

	/**
	 * Vector API version of the 4x4 float matrix product {@code m2 = m0 * m1}.
	 * Each 256-bit vector holds two consecutive rows; all three segments are read or written as
	 * 64 bytes in native byte order.
	 *
	 * @param m0 left operand (16 floats, row-major)
	 * @param m1 right operand (16 floats, row-major)
	 * @param m2 destination (16 floats, row-major)
	 */
	public static void matMul256(MemorySegment m0, MemorySegment m1, MemorySegment m2) {
		final var order = nativeOrder();
		// Rows 0-1 and 2-3 of the left operand.
		var lhsTop = FloatVector.fromMemorySegment(SPECIES_256, m0, 0, order);
		var lhsBottom = FloatVector.fromMemorySegment(SPECIES_256, m0, 32, order);
		// Rows 0-1 and 2-3 of the right operand, then each single row duplicated into both halves.
		var rhsTop = FloatVector.fromMemorySegment(SPECIES_256, m1, 0, order);
		var rhsBottom = FloatVector.fromMemorySegment(SPECIES_256, m1, 32, order);
		var rhsRow0 = rhsTop.rearrange(s01230123);
		var rhsRow2 = rhsBottom.rearrange(s01230123);
		var rhsRow1 = rhsTop.rearrange(s45674567);
		var rhsRow3 = rhsBottom.rearrange(s45674567);

		// Result rows 0-1: (c0*row0 + c1*row1) + (c3*row3 + c2*row2).
		var topFirst = lhsTop.rearrange(s00004444).fma(rhsRow0, lhsTop.rearrange(s11115555).mul(rhsRow1));
		var topSecond = lhsTop.rearrange(s33337777).fma(rhsRow3, lhsTop.rearrange(s22226666).mul(rhsRow2));
		topFirst.add(topSecond).intoMemorySegment(m2, 0, order);

		// Result rows 2-3, same scheme.
		var bottomFirst = lhsBottom.rearrange(s00004444).fma(rhsRow0, lhsBottom.rearrange(s11115555).mul(rhsRow1));
		var bottomSecond = lhsBottom.rearrange(s33337777).fma(rhsRow3, lhsBottom.rearrange(s22226666).mul(rhsRow2));
		bottomFirst.add(bottomSecond).intoMemorySegment(m2, 32, order);
	}

	/**
	 * Scalar 4x4 float matrix product {@code m2 = m0 * m1} over row-major 64-byte segments.
	 * Every output element is computed as {@code f0*b0 + f1*b1 + f2*b2 + f3*b3}, left to right.
	 *
	 * @param m0 left operand (16 floats)
	 * @param m1 right operand (16 floats)
	 * @param m2 destination (16 floats)
	 */
	public static void matMulJava(MemorySegment m0, MemorySegment m1, MemorySegment m2) {
		// rowBase and colOff are byte offsets: 16 bytes per row, 4 bytes per column.
		for (int rowBase = 0; rowBase < 64; rowBase += 16) {
			// Load the whole left-hand row before writing any element of the output row.
			final float f0 = m0.get(JAVA_FLOAT, rowBase);
			final float f1 = m0.get(JAVA_FLOAT, rowBase + 4);
			final float f2 = m0.get(JAVA_FLOAT, rowBase + 8);
			final float f3 = m0.get(JAVA_FLOAT, rowBase + 12);
			for (int colOff = 0; colOff < 16; colOff += 4) {
				m2.set(JAVA_FLOAT, rowBase + colOff,
						f0 * m1.get(JAVA_FLOAT, colOff)
								+ f1 * m1.get(JAVA_FLOAT, colOff + 16)
								+ f2 * m1.get(JAVA_FLOAT, colOff + 32)
								+ f3 * m1.get(JAVA_FLOAT, colOff + 48));
			}
		}
	}

	/**
	 * Scalar 4x4 float matrix product {@code m2 = m0 * m1} using {@link Math#fma(float, float, float)}.
	 * Every output element is {@code fma(f0, b0, fma(f1, b1, fma(f2, b2, f3 * b3)))}.
	 *
	 * @param m0 left operand (16 floats, row-major)
	 * @param m1 right operand (16 floats, row-major)
	 * @param m2 destination (16 floats, row-major)
	 */
	public static void matMulJavaFma(MemorySegment m0, MemorySegment m1, MemorySegment m2) {
		// rowBase and colOff are byte offsets: 16 bytes per row, 4 bytes per column.
		for (int rowBase = 0; rowBase < 64; rowBase += 16) {
			// Load the whole left-hand row before writing any element of the output row.
			final float f0 = m0.get(JAVA_FLOAT, rowBase);
			final float f1 = m0.get(JAVA_FLOAT, rowBase + 4);
			final float f2 = m0.get(JAVA_FLOAT, rowBase + 8);
			final float f3 = m0.get(JAVA_FLOAT, rowBase + 12);
			for (int colOff = 0; colOff < 16; colOff += 4) {
				m2.set(JAVA_FLOAT, rowBase + colOff,
						Math.fma(f0, m1.get(JAVA_FLOAT, colOff),
								Math.fma(f1, m1.get(JAVA_FLOAT, colOff + 16),
										Math.fma(f2, m1.get(JAVA_FLOAT, colOff + 32),
												f3 * m1.get(JAVA_FLOAT, colOff + 48)))));
			}
		}
	}

	/**
	 * Prints a row-major 4x4 float matrix to stdout, one row per line, followed by a "---" line.
	 *
	 * @param m segment holding 16 floats (64 bytes)
	 */
	public static void printMat(MemorySegment m) {
		// 16 bytes per row, 4 bytes per element.
		for (int rowBase = 0; rowBase < 64; rowBase += 16) {
			System.out.format("%12.6f %12.6f %12.6f %12.6f\n",
					m.get(JAVA_FLOAT, rowBase),
					m.get(JAVA_FLOAT, rowBase + 4),
					m.get(JAVA_FLOAT, rowBase + 8),
					m.get(JAVA_FLOAT, rowBase + 12));
		}
		System.out.println("---");
	}

	private static final MemorySegment m0, m1, m2;

	static {
		m0 = Arena.global().allocate(64, 64);
		m1 = Arena.global().allocate(64, 64);
		m2 = Arena.global().allocate(64, 64);
		for (int i = 0; i < 64; i += 4) {
			m0.set(JAVA_FLOAT, i, (float)i / 4 + 2);
			m1.set(JAVA_FLOAT, i, 17 - (float)i / 4);
		}
	}

	/** Times one million calls of the JVMCI-installed {@code matMul256(long, long, long)} on the shared matrices; reports to stderr. */
	public static void testMatMulJvmci() {
		// The stub works on raw addresses, so resolve them once outside the timed loop.
		final long lhs = m0.address();
		final long rhs = m1.address();
		final long dst = m2.address();
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			matMul256(lhs, rhs, dst);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testMatMulJvmci", 0f, elapsed);
		// printMat(m2);
	}

	/** Panama downcall handle for {@code matMulAvx} in vec_x64.dll, linked with {@code Linker.Option.critical(false)}. */
	private static final MethodHandle mhMatMulAvx;

	static {
		var matMulSymbol = SymbolLookup.libraryLookup("vec_x64.dll", Arena.global()).find("matMulAvx").orElseThrow();
		// void matMulAvx(long, long, long)
		var signature = FunctionDescriptor.ofVoid(JAVA_LONG, JAVA_LONG, JAVA_LONG);
		var criticalOption = Linker.Option.critical(false);
		mhMatMulAvx = Linker.nativeLinker().downcallHandle(matMulSymbol, signature, criticalOption);
	}

	/**
	 * Times one million calls through the {@code mhMatMulAvx} downcall handle on the shared
	 * matrices; reports to stderr. Anything thrown by the handle is rethrown wrapped in a
	 * {@link RuntimeException}.
	 */
	public static void testMatMulPanamaCritical() {
		try {
			final long lhs = m0.address();
			final long rhs = m1.address();
			final long dst = m2.address();
			final long start = System.nanoTime();
			for (int n = 0; n < 1_000_000; n++) {
				// Statement form keeps the exact (long, long, long)void call shape invokeExact requires.
				mhMatMulAvx.invokeExact(lhs, rhs, dst);
			}
			long elapsed = System.nanoTime() - start;
			System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testMatMulPanamaCritical", 0f, elapsed);
			// printMat(m2);
		} catch (Throwable e) {
			throw new RuntimeException(e);
		}
	}

	/** Times one million calls of the Vector API {@code matMul256} on the shared matrices; reports to stderr. */
	public static void testMatMulVector() {
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			matMul256(m0, m1, m2);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testMatMulVector", 0f, elapsed);
		// printMat(m2);
	}

	/** Times one million calls of the scalar {@code matMulJava} on the shared matrices; reports to stderr. */
	public static void testMatMulJava() {
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			matMulJava(m0, m1, m2);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testMatMulJava", 0f, elapsed);
		// printMat(m2);
	}

	/** Times one million calls of the scalar {@code matMulJavaFma} on the shared matrices; reports to stderr. */
	public static void testMatMulJavaFma() {
		final long start = System.nanoTime();
		for (int n = 0; n < 1_000_000; n++) {
			matMulJavaFma(m0, m1, m2);
		}
		long elapsed = System.nanoTime() - start;
		System.err.format("%-24s:%13f, %8d ns/1M-invokes\n", "testMatMulJavaFma", 0f, elapsed);
		// printMat(m2);
	}

	/**
	 * Runs every benchmark ten times in a fixed order, printing a "---" separator to stderr before
	 * each round. The figures beside each entry are the per-invoke times recorded on OpenJDK23.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		final Runnable[] benchmarks = {
				TestNativeCode::testDummyNaked,           // 1.4005 ns
				TestNativeCode::testAddNaked,             // 1.6334 ns
				TestNativeCode::testAdd,                  // 1.6333 ns
				TestNativeCode::testAddPanamaCritical,    // 2.5720 ns (static final)
				TestNativeCode::testAddPanama,            // 7.7158 ns (static final)
				TestNativeCode::testAddJni,               // 8.4009 ns
				TestNativeCode::testAddJava,              // 1.2278 ns (-XX:-Inline) 0.2404 ns (default)

				TestNativeCode::testRsqrtss,              // 2.1006 ns (low precision)
				TestNativeCode::testRsqrtss1,             // 5.6468 ns (good precision)
				TestNativeCode::testRsqrtss1Java,         // 2.6830 ns (-XX:-Inline) 2.1009 ns (default) (good precision)
				TestNativeCode::testRsqrtJava,            // 2.2459 ns (-XX:-Inline) 1.4010 ns (default) (best precision)

				TestNativeCode::testRdtsc,                // 5.9552 ns

				TestNativeCode::testMatMulJvmci,          // 2.5668 ns
				TestNativeCode::testMatMulPanamaCritical, // 3.7344 ns
				TestNativeCode::testMatMulVector,         // 5.6001 ns
				TestNativeCode::testMatMulJava,           //27.1248 ns
				TestNativeCode::testMatMulJavaFma,        //24.7380 ns
		};
		for (int round = 0; round < 10; round++) {
			System.err.println("---");
			for (Runnable benchmark : benchmarks) {
				benchmark.run();
			}
		}
	}
}
/*
 x64 calling convention (Linux, macOS):
   Java:        rsi, rdx, rcx,  r8,  r9, rdi, stack
   Native: rdi, rsi, rdx, rcx,  r8,  r9,      stack
 x64 calling convention (Windows):
   Java:        rdx,  r8,  r9, rdi, rsi, rcx, stack
   Native: rcx, rdx,  r8,  r9,                stack
 AArch64 calling convention:
   Java:       x1, x2, x3, x4, x5, x6, x7, x0, stack
   Native: x0, x1, x2, x3, x4, x5, x6, x7,     stack
 RISCV64 calling convention:
   Java:   x10, x11, x12, x13, x14, x15, x16, x17, stack
   Native: x10, x11, x12, x13, x14, x15, x16, x17, stack
*/
